# coding:utf-8
from pyspark import SparkConf, SparkContext
import os

# Point PySpark's JVM launcher at the local JDK install before the context starts.
os.environ['JAVA_HOME'] = '/server/jdk'

if __name__ == '__main__':
    # BUG FIX: the original chained .setAppName('test').setAppName('local[*]'),
    # which overwrote the app name with 'local[*]' and never set a master.
    # 'local[*]' is a master URL (run locally with all cores) and belongs in setMaster.
    conf = SparkConf().setAppName('test').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    try:
        # Small sample of (key, value) pairs to demonstrate groupByKey.
        rdd = sc.parallelize([('a', 1), ('a', 1), ('b', 1), ('b', 1), ('b', 1)])

        # groupByKey gathers all values per key; like groupBy, the grouped
        # values come back as an iterable (ResultIterable), not a list.
        rdd2 = rdd.groupByKey()
        print(rdd2.collect())

        # Materialize each iterable into a list for readable output.
        # Unlike groupBy, the key is NOT repeated inside the grouped values.
        print(rdd2.map(lambda kv: (kv[0], list(kv[1]))).collect())
    finally:
        # Always release the Spark context, even if a job above fails.
        sc.stop()